Cancel Table:
Package Table:
# loading libraries
import pandas as pd
import numpy as np
import re
import pandasql
from pandasql import sqldf
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from feature_engine.categorical_encoders import OrdinalCategoricalEncoder
from feature_engine.variable_transformers import LogTransformer
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
import pycountry
import warnings
warnings.filterwarnings('ignore')
# loading cancel table
cancel_table = pd.read_csv("D:\\EverythingDS\\DataSets\\DataAnalystCaseStudyData-canceltable.csv")
cancel_table.head()
# loading package table
package_table = pd.read_csv("D:\\EverythingDS\\DataSets\\DataAnalystCaseStudyData-packagetable.csv")
package_table.head()
# cancel table info (dtypes and non-null counts)
cancel_table.info()
# package table info
package_table.info()
# checking null values in cancel table
cancel_table.isnull().sum()
# checking null values in package table
package_table.isnull().sum()
# checking for fully duplicated rows
cancel_duplicates = cancel_table[cancel_table.duplicated()]
cancel_duplicates
# checking redundant data in orderid (orderids appearing more than once)
order_duplicates = cancel_table.groupby("orderid").size().reset_index()
# column 0 is the group-size column created by reset_index() on the unnamed size Series
order_duplicates[order_duplicates[0] > 1]
# inspecting one redundant orderid flagged above
cancel_table[cancel_table["orderid"] == 222087]
# keeping only the first row per orderid
cancel_table = cancel_table.drop_duplicates(subset = "orderid", keep = "first")
# extracting the numeric userid from the email-style userid field
# (raw string fixes the invalid-escape warning of the original '(\d+)')
cancel_table["userid"] = cancel_table["userid"].str.extract(r"(\d+)")
cancel_table.head()
# truncating cancel_date to midnight while keeping datetime64 dtype;
# .dt.normalize() replaces the datetime -> date -> datetime round-trip
cancel_table["cancel_date"] = pd.to_datetime(cancel_table["cancel_date"]).dt.normalize()
cancel_table.head()
# obtaining the period of the customer's stay with the company, in days
cancel_table["customer_since"] = pd.to_datetime(cancel_table["customer_since"])
cancel_table["period"] = cancel_table["cancel_date"] - cancel_table["customer_since"]
cancel_table["period"] = cancel_table["period"] / np.timedelta64(1, 'D')
cancel_table["period"].head()
# checking for different values within order status
cancel_table["Order Status"].value_counts()
# dropping Pending and Fraud: too few rows to be statistically significant
cancel_table = cancel_table[~cancel_table["Order Status"].isin(["Pending", "Fraud"])]
cancel_table["Order Status"].value_counts()
Assuming the Terminated and Cancelled statuses to be the same, since there is not enough context on the dataset to consider them separately.
# generalizing cancelled and terminated status of the order (Assumption - 1)
def terminated_to_cancelled(value):
    """Map the 'Terminated' order status onto 'Cancelled'; pass anything else through."""
    return "Cancelled" if value == "Terminated" else value
# collapse Terminated into Cancelled across the whole Order Status column
cancel_table["Order Status"] = cancel_table["Order Status"].apply(terminated_to_cancelled)
cancel_table["Order Status"].value_counts()
# checking for different values within client status
cancel_table["Client Status"].value_counts()
Assuming the Closed and Inactive statuses to be the same, since there is not enough context on the dataset to consider them separately.
# generalizing closed and inactive status of the client (Assumption - 2)
def closed_to_inactive(value):
    """Map the 'Closed' client status onto 'Inactive'; pass anything else through."""
    return "Inactive" if value == "Closed" else value
# collapse Closed into Inactive across the whole Client Status column
cancel_table["Client Status"] = cancel_table["Client Status"].apply(closed_to_inactive)
cancel_table["Client Status"].value_counts()
# keep the original ISO alpha-2 code before overwriting "country" with full names
cancel_table["country_code"] = cancel_table["country"]
# obtaining country name from country code
def code_to_country(value):
    """Return the full country name for an ISO 3166-1 alpha-2 code.

    Falls back to the original value when pycountry has no match; the
    original raised AttributeError (None.name) on unknown codes.
    """
    country = pycountry.countries.get(alpha_2=value)
    return value if country is None else country.name
cancel_table["country"] = cancel_table["country"].apply(code_to_country)
# checking for fully duplicated rows in the package table
package_duplicates = package_table[package_table.duplicated()]
package_duplicates
# dropping duplicate rows
package_table = package_table.drop_duplicates()
# renaming the two identically-named "category" columns; pandas suffixed the
# second one as "category.1" when loading the CSV
package_table.rename(columns = {"category": "proxy_category", "category.1": "history_category"}, inplace = True)
package_table.head()
# getting rid of wrong naming of the name columns
_NAME_FIXES = {
    "10 ports Monthly": "10 Ports Monthly",
    "5 Port Proxy Plan": "5 Ports Monthly",
    "No Plan Selected": "Default",
}
def renaming_names(value):
    """Normalize a package name: fix known misspellings, then join words with '_'."""
    return re.sub(r"\s+", "_", _NAME_FIXES.get(value, value))
# applying the package-name cleanup to the whole column
package_table["name"] = package_table["name"].apply(renaming_names)
package_table["name"].value_counts()
# obtaining country name from country code
# NOTE(review): this redefines code_to_country from earlier in the file (a
# notebook-cell leftover); the two definitions should be unified.
def code_to_country(value):
    """Return the full country name for an ISO alpha-2 code, or the code itself if unknown."""
    country = pycountry.countries.get(alpha_2=value)
    return value if country is None else country.name
package_table["country_name"] = package_table["country"].apply(code_to_country)
# overlapping graph for orders and customers over years
# customers: number of users per year of their earliest customer_since
grouped = cancel_table.groupby("userid")["customer_since"].min().reset_index()
grouped["start_year"] = grouped["customer_since"].dt.year
customer_grouped = grouped.groupby("start_year")["userid"].count().reset_index()
# orders: number of orders per year of each order's customer_since
grouped = cancel_table.copy()
grouped["start_year"] = grouped["customer_since"].dt.year
year_grouped = grouped.groupby("start_year")["orderid"].count().reset_index()
# line for orders, bars for customers, on a shared x axis
trace1 = go.Scatter(
x = year_grouped["start_year"],
y = year_grouped["orderid"],
name = "orders"
)
trace2 = go.Bar(
x = customer_grouped["start_year"],
y = customer_grouped["userid"],
name = "customers"
)
data = [trace1, trace2]
fig = go.Figure(data = data)
fig.show()
# geo map for customers from different countries (unique users per country)
import plotly.express as px
df = cancel_table.groupby("country")["userid"].nunique().reset_index()
fig = px.choropleth(df, locations="country",
                    color="userid",
                    locationmode="country names",
                    color_continuous_scale=px.colors.sequential.Plasma
                    )
fig.show()
# setting up the data to plot a pie chart, pulling out the largest slice
df.sort_values("userid", inplace=True, ascending=False)
# size the pull list from the data instead of the hard-coded 110 entries
pull = [0.1] + [0] * (len(df) - 1)
fig = go.Figure(data=[go.Pie(labels=df["country"], values=df["userid"],
                             pull=pull
                             )])
fig.show()
# checking where our main source of orders is from (order counts per country)
df = cancel_table.groupby("country")["orderid"].count().reset_index()
fig = px.choropleth(df, locations="country",
                    color="orderid",
                    locationmode="country names",
                    color_continuous_scale=px.colors.sequential.Plasma
                    )
fig.show()
# setting up the data to plot a pie chart, pulling out the largest slice
df.sort_values("orderid", inplace=True, ascending=False)
# size the pull list from the data instead of the hard-coded 110 entries
pull = [0.1] + [0] * (len(df) - 1)
fig = go.Figure(data=[go.Pie(labels=df["country"], values=df["orderid"],
                             pull=pull
                             )])
fig.show()
# Comparing US and other countries
df = cancel_table.copy()
def us_or_not(value):
    """1 for US customers, 0 for everyone else."""
    return 1 if value == "US" else 0
df["US"] = df["country_code"].apply(us_or_not)
# churn rate for US and non-US customers: take each user's most recent order
# (latest cancel_date) as their current status.
# DataFrame.append was removed in pandas 2.0 -> collect frames and concat once.
rows = []
for name, group in df.groupby(["US", "userid"]):
    latest = group.sort_values("cancel_date", ascending=False).head(1)
    rows.append(latest[["US", "userid", "country", "Client Status"]])
sub_grouped = pd.concat(rows, ignore_index=True)
# share of users whose latest status is Inactive, per US flag
churn_rows = []
for name, group in sub_grouped.groupby("US"):
    country = "US" if name == 1 else "Non-US"
    churn_rate = len(group[group["Client Status"] == "Inactive"]) / len(group)
    churn_rows.append({"country": country, "rate": churn_rate})
churn = pd.DataFrame(churn_rows, columns=["country", "rate"])
fig = px.bar(churn, x="country", y="rate", color="country")
fig.show()
# identifying the top 10 products by number of orders
df = cancel_table.copy()
grouped = (df.groupby("product")["orderid"]
             .count()
             .reset_index()
             .sort_values("orderid", ascending=False)
             .head(10))
fig = px.bar(grouped, x="product", y="orderid")
fig.show()
# dataframe defining overall statistics of each product type
# DataFrame.append was removed in pandas 2.0 -> build a record list instead
df = cancel_table.copy()
records = []
for name, group in df.groupby("product"):
    total_count = len(group)
    active_count = len(group[group["Order Status"] == "Active"])
    cancelled_count = len(group[group["Order Status"] == "Cancelled"])
    records.append({
        "Product": name,
        "Total_Count": total_count,
        "Active_Count": active_count,
        "Cancelled_Count": cancelled_count,
        "Churn_Rate": cancelled_count / total_count,
        "Retention_Rate": active_count / total_count,
    })
new_df = pd.DataFrame(records, columns=["Product", "Total_Count", "Active_Count",
                                        "Cancelled_Count", "Churn_Rate", "Retention_Rate"])
# keep the 10 highest-volume products for the grouped bar chart
new_df.sort_values("Total_Count", ascending=False, inplace=True)
new_df = new_df.iloc[:10, :]
trace1 = go.Bar(x=new_df["Product"], y=new_df["Total_Count"], name="Total Orders")
trace2 = go.Bar(x=new_df["Product"], y=new_df["Cancelled_Count"], name="Cancelled")
trace3 = go.Bar(x=new_df["Product"], y=new_df["Active_Count"], name="Active")
data = [trace1, trace2, trace3]
fig = go.Figure(data=data)
fig.show()
# pie chart for history_category (customer counts per category)
data = package_table.copy()
df = data.groupby("history_category")["CustomerID"].count().reset_index()
df.sort_values("CustomerID", ascending=False, inplace=True)
# derive the pull list from the data instead of hard-coding 7 categories
pull = [0.1] + [0] * (len(df) - 1)
fig = go.Figure(data=[go.Pie(labels=df["history_category"], values=df["CustomerID"],
                             pull=pull
                             )])
fig.show()
# geo map for data centers: customer counts per package country
import plotly.express as px
df = package_table.groupby("country_name")["CustomerID"].count().reset_index()
fig = px.choropleth(df, locations="country_name",
color="CustomerID",
locationmode = "country names",
color_continuous_scale=px.colors.sequential.Plasma
)
fig.show()
# pie chart of customer counts per package country, largest slice pulled out
data = package_table.copy()
df = data.groupby("country_name")["CustomerID"].count().reset_index()
df.sort_values("CustomerID", ascending=False, inplace=True)
# derive the pull list from the data instead of hard-coding 12 countries
pull = [0.1] + [0] * (len(df) - 1)
fig = go.Figure(data=[go.Pie(labels=df["country_name"], values=df["CustomerID"],
                             pull=pull
                             )])
fig.show()
# pie chart of customer counts per (cleaned) package name
pkg = package_table.copy()
name_counts = pkg.groupby("name")["CustomerID"].count().reset_index()
fig = go.Figure(data=[go.Pie(labels=name_counts["name"], values=name_counts["CustomerID"])])
fig.show()
# defining function to get overall revenue for an order
def revenue(df):
    """Annualized revenue for one order row: amount times billing cycles per year."""
    cycles_per_year = {"Monthly": 12, "Quarterly": 4, "Semi-Annually": 2}
    return df["amount"] * cycles_per_year.get(df["billingcycle"], 1)
# obtaining the dataframe determining the tenure, no. of orders and revenue of each customer
df = cancel_table.copy()
df["revenue"] = df.apply(revenue, axis=1)
# vectorised named aggregation replaces the original append loop, which both
# relied on DataFrame.append (removed in pandas 2.0) and shadowed the
# module-level revenue() function with a local float
new_df = (df.groupby("userid")
            .agg(tenure=("period", "sum"),
                 orders=("orderid", "count"),
                 revenue=("revenue", "sum"))
            .reset_index())
new_df = new_df[["userid", "tenure", "orders", "revenue"]]
new_df.head()
# histogram for tenure distribution (all customers)
plot_data = [
go.Histogram(
x=new_df['tenure']
)
]
plot_layout = go.Layout(
title='Tenure'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
fig.show()
# removing outliers for better visual representation
new_df1 = new_df[new_df["tenure"] <= 4000]
# histogram for tenure distribution (tenure capped at 4000 days)
plot_data = [
go.Histogram(
x=new_df1['tenure']
)
]
plot_layout = go.Layout(
title='Tenure'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
fig.show()
# creating elbow plot to decide on right number of clusters (tenure)
sse = {}
tenure = new_df[['tenure']].copy()  # .copy() avoids SettingWithCopy when adding "clusters"
for k in range(1, 10):
    # fit on the feature column only: the original fitted on the whole frame,
    # so every k >= 2 re-trained on [tenure, previous labels] — a real bug
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(tenure[['tenure']])
    tenure["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.show()
# using k-means to assign tenure clusters
# NOTE(review): no random_state is set, so cluster ids can vary between runs;
# order_cluster() below re-maps them onto a stable ordering afterwards
kmeans = KMeans(n_clusters=4)
kmeans.fit(new_df[['tenure']])
new_df['TenureCluster'] = kmeans.predict(new_df[['tenure']])
# describing cluster info
new_df.groupby("TenureCluster")["tenure"].describe()
# defining a function to order the cluster numbers
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel cluster ids so they are ordered by each cluster's mean of target_field_name.

    Returns a copy of df where cluster_field_name holds the new, ordered ids
    (with ascending=False, the highest-mean cluster becomes id 0).
    """
    means = (df.groupby(cluster_field_name)[target_field_name]
               .mean()
               .reset_index()
               .sort_values(by=target_field_name, ascending=ascending)
               .reset_index(drop=True))
    means['index'] = means.index
    relabeled = (df.merge(means[[cluster_field_name, 'index']], on=cluster_field_name)
                   .drop(columns=[cluster_field_name])
                   .rename(columns={'index': cluster_field_name}))
    return relabeled
# ordering the cluster numbers (ascending=False: highest mean tenure -> id 0)
new_df = order_cluster('TenureCluster', "tenure", new_df, False)
new_df.groupby("TenureCluster")["tenure"].describe()
# converting data type of orders (the column is object-typed after the row-wise build)
new_df["orders"] = new_df["orders"].astype(int)
# histogram for the per-customer order-count distribution
plot_data = [
go.Histogram(
x=new_df['orders']
)
]
plot_layout = go.Layout(
title='Orders'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
fig.show()
# creating elbow plot to decide on right number of clusters (orders)
sse = {}
orders = new_df[['orders']].copy()  # .copy() avoids SettingWithCopy when adding "clusters"
for k in range(1, 10):
    # fit on the feature column only: the original fitted on the whole frame,
    # so every k >= 2 re-trained on [orders, previous labels] — a real bug
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(orders[['orders']])
    orders["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.show()
# using k-means to assign order-count clusters (no random_state -> ids vary per run)
kmeans = KMeans(n_clusters=4)
kmeans.fit(new_df[['orders']])
new_df['OrderCluster'] = kmeans.predict(new_df[['orders']])
# describing cluster info
new_df.groupby("OrderCluster")["orders"].describe()
# ordering the cluster numbers (highest mean order count -> id 0)
new_df = order_cluster('OrderCluster', 'orders', new_df, False)
new_df.groupby("OrderCluster")["orders"].describe()
# histogram for revenue distribution (all customers)
plot_data = [
go.Histogram(
x=new_df['revenue']
)
]
plot_layout = go.Layout(
title='Revenue'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
fig.show()
# removing outliers for better visual representation
new_df1 = new_df[new_df["revenue"] <= float(10000)]
# histogram for revenue distribution (revenue capped at 10000)
plot_data = [
go.Histogram(
x=new_df1['revenue']
)
]
plot_layout = go.Layout(
title='Revenue'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
fig.show()
# creating elbow plot to decide on right number of clusters (revenue)
sse = {}
revenue_feats = new_df[['revenue']].copy()  # .copy() avoids SettingWithCopy; new name avoids shadowing revenue()
for k in range(1, 10):
    # fit on the feature column only: the original fitted on the whole frame,
    # so every k >= 2 re-trained on [revenue, previous labels] — a real bug
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(revenue_feats[['revenue']])
    revenue_feats["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.show()
# using k-means to assign revenue clusters (first attempt: 3 clusters)
kmeans = KMeans(n_clusters=3)
kmeans.fit(new_df[['revenue']])
new_df['RevenueCluster'] = kmeans.predict(new_df[['revenue']])
# describing cluster info
new_df.groupby("RevenueCluster")["revenue"].describe()
# ordering cluster numbers
new_df = order_cluster('RevenueCluster', 'revenue', new_df, False)
new_df.groupby("RevenueCluster")["revenue"].describe()
# re-clustering revenue with 5 clusters; this overwrites the 3-cluster
# RevenueCluster assignment above (presumably the 3-cluster split was too coarse)
kmeans = KMeans(n_clusters=5)
kmeans.fit(new_df[['revenue']])
new_df['RevenueCluster'] = kmeans.predict(new_df[['revenue']])
# describing cluster info
new_df.groupby("RevenueCluster")["revenue"].describe()
# ordering cluster numbers
new_df = order_cluster('RevenueCluster', 'revenue', new_df, False)
new_df.groupby("RevenueCluster")["revenue"].describe()
# displaying data with all clusters assigned
new_df.head()
# calculating overall score (sum of the three ordered cluster ids)
new_df['OverallScore'] = new_df['TenureCluster'] + new_df['OrderCluster'] + new_df['RevenueCluster']
# select with a list: ['tenure','orders','revenue'] as a bare tuple key was
# deprecated and removed in pandas >= 2.0
new_df.groupby('OverallScore')[['tenure', 'orders', 'revenue']].mean().reset_index()
# defining a function to assign value segments from the overall score
def segments(score):
    """Label a customer by OverallScore: 3-6 high value, 7-8 mid, anything else low."""
    if score in range(3, 7):
        return "High Value Customers"
    if score in range(7, 9):
        return "Mid Value Customers"
    return "Low Value Customers"
# labeling each customer from the summed cluster score
new_df["Segment"] = new_df["OverallScore"].apply(segments)
new_df.head()
# viewing customer counts of different segments.
new_df["Segment"].value_counts()
# ignoring outliers (very high values) for clearer visualization of segments
new_df = new_df.query("revenue < 80000 and tenure < 10000")
# plotting customer segments for revenue vs tenure
# one scatter trace per segment: (segment value, legend name, size, color, opacity)
plot_data = []
for seg, label, size, color, opacity in [
    ("Low Value Customers", "Low", 7, "blue", 0.8),
    ("Mid Value Customers", "Mid", 9, "green", 0.5),
    ("High Value Customers", "High", 11, "red", 0.9),
]:
    seg_df = new_df[new_df["Segment"] == seg]
    plot_data.append(
        go.Scatter(
            x=seg_df['tenure'],
            y=seg_df['revenue'],
            mode='markers',
            name=label,
            marker=dict(size=size, line=dict(width=1), color=color, opacity=opacity)
        )
    )
plot_layout = go.Layout(
    yaxis={'title': "Revenue"},
    xaxis={'title': "Tenure"},
    title='Customer Segments - Revenue VS Tenure',
    autosize=False, width=800, height=700
)
fig = go.Figure(data=plot_data, layout=plot_layout)
fig.show()
# plotting customer segments for revenue vs orders
# one scatter trace per segment: (segment value, legend name, size, color, opacity)
plot_data = []
for seg, label, size, color, opacity in [
    ("Low Value Customers", "Low", 7, "blue", 0.8),
    ("Mid Value Customers", "Mid", 9, "green", 0.5),
    ("High Value Customers", "High", 11, "red", 0.9),
]:
    seg_df = new_df[new_df["Segment"] == seg]
    plot_data.append(
        go.Scatter(
            x=seg_df['orders'],
            y=seg_df['revenue'],
            mode='markers',
            name=label,
            marker=dict(size=size, line=dict(width=1), color=color, opacity=opacity)
        )
    )
plot_layout = go.Layout(
    yaxis={'title': "Revenue"},
    xaxis={'title': "Orders"},
    title='Customer Segments - Revenue VS Orders',
    autosize=False, width=800, height=700
)
fig = go.Figure(data=plot_data, layout=plot_layout)
fig.show()
# plotting customer segments for tenure vs orders
# one scatter trace per segment: (segment value, legend name, size, color, opacity)
plot_data = []
for seg, label, size, color, opacity in [
    ("Low Value Customers", "Low", 7, "blue", 0.8),
    ("Mid Value Customers", "Mid", 9, "green", 0.5),
    ("High Value Customers", "High", 11, "red", 0.9),
]:
    seg_df = new_df[new_df["Segment"] == seg]
    plot_data.append(
        go.Scatter(
            x=seg_df['orders'],
            y=seg_df['tenure'],
            mode='markers',
            name=label,
            marker=dict(size=size, line=dict(width=1), color=color, opacity=opacity)
        )
    )
plot_layout = go.Layout(
    yaxis={'title': "Tenure"},
    xaxis={'title': "Orders"},
    title='Customer Segments - Tenure VS Orders',
    autosize=False, width=800, height=700
)
fig = go.Figure(data=plot_data, layout=plot_layout)
fig.show()
# loading the pre-aggregated modeling dataset
df = pd.read_csv("D:\\EverythingDS\\DataSets\\model_data.csv")
df.head()
# Mapping the Client Status - Active to 0 and Inactive to 1
status_map = {'Active' : 0 , 'Inactive' : 1}
df['client_status'] = df['client_status'].map(status_map)
# We have quite a high imbalance in the dataset with a majority of Inactive customers
df["client_status"].value_counts()
# Checking the Data type of the features
df.info()
# Segregating the features to streamline preprocessing pipeline
# numerical: discrete vs continuous
# NOTE(review): the var != 'y' guard looks like a leftover — no 'y' column is created here
discrete = [var for var in df.columns if df[var].dtype!='O' and var!='y' and df[var].nunique()<10]
continuous = [var for var in df.columns if df[var].dtype!='O' and var!='y' and var not in discrete]
# categorical
categorical = [var for var in df.columns if df[var].dtype=='O' and var!='y']
print(f'There are {len(discrete)} discrete variables : ',discrete)
print(f'There are {len(continuous)} continuous variables : ', continuous)
print(f'There are {len(categorical)} categorical variables : ',categorical)
# Considering user_id as a continuous variable may impact predictive power of a model and create noise, so it is safer to drop it
df.drop(['userid'], axis = 1, inplace = True)
continuous = [var for var in df.columns if df[var].dtype!='O' and var!='y' and var not in discrete]
print(f'There are {len(continuous)} continuous variables : ', continuous)
# We are transforming continuous variables so that we have a distribution tending to be normal
# log1p transforming the continuous variables since they have 0 values
df['revenue'] = df['revenue'].apply(np.log1p)
df['orders'] = df['orders'].apply(np.log1p)
df['tenure'] = df['tenure'].apply(np.log1p)
# Splitting the dataset into train and test so that we can validate if our model performs well
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop('client_status', axis=1), df.client_status,
shuffle = True, stratify = df.client_status,
test_size=0.2, random_state=0)
# Preprocessing pipeline:
#  - OrdinalCategoricalEncoder maps each category to an arbitrary integer
#    (NOTE(review): this is feature_engine's legacy module path; newer releases
#    moved it to feature_engine.encoding.OrdinalEncoder — confirm the pinned version)
#  - StandardScaler(with_std=False) only centers the features
#  - RobustScaler with quantile_range=(0, 100) then scales by the full value range
prep_pipe = Pipeline([
# Encoding the country code column
('ordinal_enc', OrdinalCategoricalEncoder(variables=categorical, encoding_method='arbitrary')),
# Scaling
("standardscaler" , StandardScaler(with_mean=True, with_std=False)),
("robustscaler" , RobustScaler(with_centering=False, with_scaling=True, quantile_range=(0, 100)))
])
prep_pipe.fit(X_train)
X_train=prep_pipe.transform(X_train)
X_test=prep_pipe.transform(X_test)
# randomized hyper-parameter search for CatBoost, optimizing F1
cat = CatBoostClassifier()
param_grid = {
'loss_function' :['Logloss'],
'depth':[3,1,2,6,4,5,7,8,9,10],
'iterations':[250,100,500,1000],
'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3],
'l2_leaf_reg':[3,1,5,10,100],
'border_count':[32,5,10,20,50,100,200],
# assigning class weights so that we can resolve the imbalance in the target attribute of the dataset
'class_weights' : [[100,1], [10,80], [1,20], [1,1], [1,10], [1,100], [1,50], [1, 30]]
}
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
cat_grid = RandomizedSearchCV(cat, param_grid, cv = 5, n_jobs=-1, scoring = 'f1')
cat_grid.fit(X_train, y_train)
print(f'Best Params: {cat_grid.best_params_}\n')
print(f'Best Mean Cross Validation Score is {cat_grid.best_score_}\n')
print(f'Test score is {cat_grid.score(X_test,y_test)}')
from sklearn.metrics import classification_report
y_pred = cat_grid.predict(X_test)
# classification_report's signature is (y_true, y_pred); the original swapped
# them, which transposes precision and recall in the printed report
print(classification_report(y_test, y_pred))
# confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(tn, fp, fn, tp)
We have generated a satisfactory score of about 91% on our chosen metric, but as you can see above, we have a significant number of false positive predictions. In the future I hope to resolve this by building models that account for the imbalance in the dataset (using cost-sensitive, data-sampling, and probability-calibration based models) to optimize the classification.
# attempt to find conversion rate per (user, product)
# DataFrame.append was removed in pandas 2.0 -> build a record list instead
df = cancel_table.copy()
conversion_records = []
for name, group in df.groupby(["userid", "product"]):
    group = group.sort_values("customer_since")
    if group["orderid"].count() > 1 and group["Customer Type"].nunique() > 1:
        type_list = list(group["Customer Type"])
        # NOTE(review): this flags a "Paying" order followed by a "Trial" one;
        # a trial-to-paid conversion would be the opposite order ("Trial" then
        # "Paying") — confirm the intended definition before trusting the rate.
        converted = any(
            type_list[i] == "Paying" and type_list[i + 1] == "Trial"
            for i in range(len(type_list) - 1)
        )
        if converted:
            conversion_records.append({"userid": group.iloc[0]["userid"],
                                       "product": group.iloc[0]["product"],
                                       "status": "converted"})
conversion = pd.DataFrame(conversion_records, columns=["userid", "product", "status"])
conversion
# function for overall revenue (annualized by billing cycle)
def revenue(df):
    """Annual revenue of one order row: amount times billing cycles per year."""
    periods_per_year = {"Monthly": 12, "Quarterly": 4, "Semi-Annually": 2}
    return df["amount"] * periods_per_year.get(df["billingcycle"], 1)
# organizing dataset for joining (per-user summary with latest client status)
df = cancel_table.copy()
df["revenue"] = df.apply(revenue, axis=1)
# df = pd.get_dummies(df, columns = ["product"])
# DataFrame.append was removed in pandas 2.0 -> build a record list instead
user_records = []
for name, group in df.groupby("userid"):
    # the row with the latest cancel_date carries the user's current client status
    latest = group.sort_values("cancel_date", ascending=False).iloc[0]
    user_records.append({"userid": name,
                         "country_code": group.iloc[0]["country_code"],
                         "orders": group["orderid"].count(),
                         "revenue": group["revenue"].sum(),
                         "tenure": group["period"].sum(),
                         "client_status": latest["Client Status"],
                         "product": group["product"].nunique()})
final_data = pd.DataFrame(user_records,
                          columns=["userid", "country_code", "orders", "revenue",
                                   "tenure", "client_status", "product"])
final_data.head()
# one-hot encoding package name and history_category for per-customer sums
df = pd.get_dummies(package_table, columns = ["name", "history_category"])
# "semi-3" is presumably a data artifact of the semi category — collapse the column name
df.rename(columns = {"history_category_semi-3" : "history_category_semi"}, inplace = True)
df.head()
# reducing the package table: per-customer sums of the one-hot columns
q = """
SELECT CustomerID, SUM(history_category_block),
SUM(history_category_mobile), SUM(history_category_rotate),
SUM(history_category_semi), SUM(history_category_shopify),
SUM(history_category_sneaker), SUM(history_category_static),
SUM(name_10_Ports_Monthly), SUM(name_100_Ports_Monthly),
SUM(name_1000_Ports_Monthly), SUM(name_25_Ports_Monthly),
SUM(name_5_Ports_Monthly), SUM(name_50_Ports_Monthly),
SUM(name_Default)
FROM df
GROUP BY CustomerID
"""
result = sqldf(q, globals())
result.head()
# calculating total packages per customer across all plan-name columns
result["packages"] = result["SUM(name_10_Ports_Monthly)"] + result["SUM(name_100_Ports_Monthly)"] \
+ result["SUM(name_1000_Ports_Monthly)"] + result["SUM(name_25_Ports_Monthly)"] \
+ result["SUM(name_5_Ports_Monthly)"] + result["SUM(name_50_Ports_Monthly)"] \
+ result["SUM(name_Default)"]
result.head()
# joining the per-user order summary with the per-customer package counts
join_query = """
SELECT fd.userid, fd.orders, fd.product, rd.packages
FROM final_data as fd
INNER JOIN result as rd
ON fd.userid = rd.CustomerID
"""
joined = sqldf(join_query, globals())
joined.head()
# how often does the distinct-product count equal the total package count?
matches = list(joined["product"] == joined["packages"])
print(len(matches))
print(matches.count(True))